pacman::p_load(dplyr, purrr, readr, ggiraph,
ggplot2, lubridate, ggrepel,
patchwork, ggthemes, hrbrthemes, tidyverse, plotly, readr)Take-home_Ex02
DataVis Makeover

Hello,today we choose the analysis from ZOU JIAXUN.I’ll criticize you.![]()

This scatter plot, which displays the relationship between property area (in square feet) and transacted prices (in thousands of dollars), is created from a randomly sampled subset of 500 data points from a comprehensive dataset, rather than isolating data from the first quarter of 2024. This approach may not provide an accurate reflection of the specific market conditions of early 2024, as it includes data from various time periods with potentially different economic influences. Additionally, the plot does not differentiate between types of sales, such as new homes, resales, or foreclosures, which can vary significantly in pricing dynamics. This aggregation could obscure distinct patterns and trends that are more apparent when analyzing these groups separately, leading to less precise insights for targeted strategic decision-making or market analysis.
Getting Started
Loading the required packages
First import the packages used for the original analysis and the packages required for the new analysis.
Loading the data into the R environment
data1 = read_csv("data/ResidentialTransaction20240308160536.csv")
data2 = read_csv("data/ResidentialTransaction20240308160736.csv")
data3 = read_csv("data/ResidentialTransaction20240308161009.csv")
data4 = read_csv("data/ResidentialTransaction20240308161109.csv")
data5 = read_csv("data/ResidentialTransaction20240414220633.csv")
combined_data <- bind_rows(data1, data2, data3, data4, data5)
glimpse(combined_data)Rows: 26,806
Columns: 21
$ `Project Name` <chr> "THE REEF AT KING'S DOCK", "URBAN TREASU…
$ `Transacted Price ($)` <dbl> 2317000, 1823500, 1421112, 1258112, 1280…
$ `Area (SQFT)` <dbl> 882.65, 882.65, 1076.40, 1033.34, 871.88…
$ `Unit Price ($ PSF)` <dbl> 2625, 2066, 1320, 1218, 1468, 1767, 1095…
$ `Sale Date` <chr> "01 Jan 2023", "02 Jan 2023", "02 Jan 20…
$ Address <chr> "12 HARBOURFRONT AVENUE #05-32", "205 JA…
$ `Type of Sale` <chr> "New Sale", "New Sale", "New Sale", "New…
$ `Type of Area` <chr> "Strata", "Strata", "Strata", "Strata", …
$ `Area (SQM)` <dbl> 82.0, 82.0, 100.0, 96.0, 81.0, 308.7, 42…
$ `Unit Price ($ PSM)` <dbl> 28256, 22238, 14211, 13105, 15802, 19015…
$ `Nett Price($)` <chr> "-", "-", "-", "-", "-", "-", "-", "-", …
$ `Property Type` <chr> "Condominium", "Condominium", "Executive…
$ `Number of Units` <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ Tenure <chr> "99 yrs from 12/01/2021", "Freehold", "9…
$ `Completion Date` <chr> "Uncompleted", "Uncompleted", "Uncomplet…
$ `Purchaser Address Indicator` <chr> "HDB", "Private", "HDB", "HDB", "HDB", "…
$ `Postal Code` <chr> "097996", "419535", "269343", "269294", …
$ `Postal District` <chr> "04", "14", "27", "27", "28", "19", "10"…
$ `Postal Sector` <chr> "09", "41", "26", "26", "79", "54", "27"…
$ `Planning Region` <chr> "Central Region", "East Region", "North …
$ `Planning Area` <chr> "Bukit Merah", "Bedok", "Yishun", "Yishu…
Preparing the data
Clean and process data
library(dplyr)
library(lubridate)
data_cleaned <- combined_data %>%
mutate(`Sale Date` = dmy(`Sale Date`)) %>%
mutate(`Area (SQM)` = as.numeric(gsub(",", "", `Area (SQM)`))) %>%
mutate(`Unit Price ($ PSM)` = as.numeric(gsub("\\$", "", gsub(",", "", `Unit Price ($ PSM)`)))) %>%
mutate(`Area (SQM)` = ifelse(is.na(`Area (SQM)`), 0, `Area (SQM)`)) %>%
mutate(Area_Category = cut(
`Area (SQM)`,
breaks = c(0, 100, 200, 300, 400, Inf),
labels = c("<100", "100-200", "200-300", "300-400", ">400"),
include.lowest = TRUE
))
data_cleaned <- data_cleaned %>%
mutate(Quarter = paste(year(`Sale Date`),
ifelse(month(`Sale Date`) %in% c(1, 2, 3), "Q1",
ifelse(month(`Sale Date`) %in% c(4, 5, 6), "Q2",
ifelse(month(`Sale Date`) %in% c(7, 8, 9), "Q3", "Q4"))), sep = "-"))
str(data_cleaned)tibble [26,806 × 23] (S3: tbl_df/tbl/data.frame)
$ Project Name : chr [1:26806] "THE REEF AT KING'S DOCK" "URBAN TREASURES" "NORTH GAIA" "NORTH GAIA" ...
$ Transacted Price ($) : num [1:26806] 2317000 1823500 1421112 1258112 1280000 ...
$ Area (SQFT) : num [1:26806] 883 883 1076 1033 872 ...
$ Unit Price ($ PSF) : num [1:26806] 2625 2066 1320 1218 1468 ...
$ Sale Date : Date[1:26806], format: "2023-01-01" "2023-01-02" ...
$ Address : chr [1:26806] "12 HARBOURFRONT AVENUE #05-32" "205 JALAN EUNOS #08-02" "29 YISHUN CLOSE #08-10" "45 YISHUN CLOSE #07-42" ...
$ Type of Sale : chr [1:26806] "New Sale" "New Sale" "New Sale" "New Sale" ...
$ Type of Area : chr [1:26806] "Strata" "Strata" "Strata" "Strata" ...
$ Area (SQM) : num [1:26806] 82 82 100 96 81 ...
$ Unit Price ($ PSM) : num [1:26806] 28256 22238 14211 13105 15802 ...
$ Nett Price($) : chr [1:26806] "-" "-" "-" "-" ...
$ Property Type : chr [1:26806] "Condominium" "Condominium" "Executive Condominium" "Executive Condominium" ...
$ Number of Units : num [1:26806] 1 1 1 1 1 1 1 1 1 1 ...
$ Tenure : chr [1:26806] "99 yrs from 12/01/2021" "Freehold" "99 yrs from 15/02/2021" "99 yrs from 15/02/2021" ...
$ Completion Date : chr [1:26806] "Uncompleted" "Uncompleted" "Uncompleted" "Uncompleted" ...
$ Purchaser Address Indicator: chr [1:26806] "HDB" "Private" "HDB" "HDB" ...
$ Postal Code : chr [1:26806] "097996" "419535" "269343" "269294" ...
$ Postal District : chr [1:26806] "04" "14" "27" "27" ...
$ Postal Sector : chr [1:26806] "09" "41" "26" "26" ...
$ Planning Region : chr [1:26806] "Central Region" "East Region" "North Region" "North Region" ...
$ Planning Area : chr [1:26806] "Bukit Merah" "Bedok" "Yishun" "Yishun" ...
$ Area_Category : Factor w/ 5 levels "<100","100-200",..: 1 1 1 1 1 4 5 2 1 2 ...
$ Quarter : chr [1:26806] "2023-Q1" "2023-Q1" "2023-Q1" "2023-Q1" ...
Validate data according to original methods
duplicate <- combined_data %>%
group_by_all() %>%
filter(n()>1) %>%
ungroup()duplicate# A tibble: 0 × 21
# ℹ 21 variables: Project Name <chr>, Transacted Price ($) <dbl>,
# Area (SQFT) <dbl>, Unit Price ($ PSF) <dbl>, Sale Date <chr>,
# Address <chr>, Type of Sale <chr>, Type of Area <chr>, Area (SQM) <dbl>,
# Unit Price ($ PSM) <dbl>, Nett Price($) <chr>, Property Type <chr>,
# Number of Units <dbl>, Tenure <chr>, Completion Date <chr>,
# Purchaser Address Indicator <chr>, Postal Code <chr>,
# Postal District <chr>, Postal Sector <chr>, Planning Region <chr>, …
F1 <- ggplot(combined_data, aes(x = `Property Type`)) +
geom_bar_interactive(aes(fill = `Planning Region`), position = "dodge") +
labs(x = "Property Type", y = "Frequency",
title = "Frequency of Property Types by Planning Region") +
facet_wrap(~ `Planning Region`, scales = "free") +
theme_stata(base_size = 2.5)
theme(axis.text.x = element_text(angle = 45, hjust = 1)) List of 1
$ axis.text.x:List of 11
..$ family : NULL
..$ face : NULL
..$ colour : NULL
..$ size : NULL
..$ hjust : num 1
..$ vjust : NULL
..$ angle : num 45
..$ lineheight : NULL
..$ margin : NULL
..$ debug : NULL
..$ inherit.blank: logi FALSE
..- attr(*, "class")= chr [1:2] "element_text" "element"
- attr(*, "class")= chr [1:2] "theme" "gg"
- attr(*, "complete")= logi FALSE
- attr(*, "validate")= logi TRUE
F1
DataViz Makeover
Step 1: time analysis
The absence of data from the previous year’s four quarters in the scatter plot analyzing the relationship between transacted prices and area poses certain challenges. Primarily, it limits the depth of market trend analysis over an extended period, potentially overlooking seasonal fluctuations and economic cycles that significantly influence real estate markets. Without this broader temporal context, the analysis might give undue emphasis to short-term variations or isolated events, potentially leading to a somewhat narrow perspective on market dynamics. Such a limitation curtails the capacity to observe and interpret long-term trends and the cyclical nature of the housing market, which are essential for comprehensive statistical analysis and forecasting.
Furthermore, the lack of previous year data restricts year-over-year comparative analysis, crucial for evaluating market condition changes and understanding the influence of external economic factors like interest rate shifts or broader economic changes. For investors and policymakers, the unavailability of a more extended data range may hinder informed strategic decision-making. It limits the development of predictive models that rely on historical data to accurately project future market behaviors, thereby subtly reducing the analysis’s overall utility in supporting nuanced business and investment decisions.
To address the challenges of a limited temporal analysis, I have enhanced the dataset by incorporating data from the previous year and categorizing it by quarters. This addition allows for a more nuanced exploration of the changes and trends over time. By analyzing the data on a quarterly basis, we can better understand the seasonal fluctuations and the impact of economic cycles on the real estate market.
library(plotly)
P3_interactive <- plot_ly(data = data_cleaned, x = ~`Area (SQFT)`, y = ~`Transacted Price ($)`, type = 'scatter', mode = 'markers',
hoverinfo = 'text',
transforms = list(
list(
type = 'filter',
target = ~Quarter,
operation = '=',
value = unique(data_cleaned$Quarter)[1]
)
),
text = ~paste("Price: ", `Transacted Price ($)`, "<br>Area: ", `Area (SQFT)`, "<br>Quarter: ", Quarter)) %>%
layout(title = 'Scatter Plot of Transacted Price vs. Area',
xaxis = list(title = 'Area (SQFT)'),
yaxis = list(title = 'Transacted Price ($)'),
sliders = list(list(
active = 0,
currentvalue = list(prefix = "Quarter: "),
steps = lapply(unique(data_cleaned$Quarter), function(q) {
list(label = q, method = "restyle", args = list("transforms[0].value", q))
})
)))P3_interactiveThe interactive scatter plot offers several advantages over the static scatter plot with a trend line, particularly in terms of user engagement and analytical depth. Firstly, its interactivity enhances user experience by allowing for the exploration of individual data points through tooltips and dynamic elements such as sliders for temporal data filtering. This interactivity not only makes the data more accessible but also enables users to perform on-the-fly analysis of specific time periods without the need to switch between different plots. Moreover, the ability to zoom in and navigate through dense clusters of data points helps in examining detailed patterns that might be obscured in a static plot. This dynamic functionality is especially valuable in digital platforms where users expect an interactive and engaging data visualization experience.
Step 2: Type of Sale
Failing to categorize data by “Type of Sale” in an analysis of real estate transactions introduces significant limitations that can obscure crucial insights into different market segments. Without this categorization, the analysis merges various types of transactions, such as new sales, resales, and sub-sales, into a single aggregated dataset. This aggregation can lead to a generalized overview that fails to capture the distinct behaviors and trends associated with each sale type.
Firstly, different sale types often exhibit unique pricing patterns, demand cycles, and buyer preferences. For instance, new sales might be influenced by developer promotions and economic incentives, while resales are impacted more by the existing housing market conditions. Sub-sales, involving properties sold before their construction completion, might fluctuate based on speculative market sentiments. Without distinguishing these types, strategic decision-making becomes challenging as the nuanced dynamics of the market are not adequately represented.
Moreover, policies and marketing strategies tailored to specific sale types cannot be effectively formulated or implemented without a clear understanding of the particular characteristics and needs of each segment. For example, marketing strategies that are effective for new developments might not work for resales. Therefore, by not categorizing data by “Type of Sale,” the analysis loses the potential to guide targeted interventions and optimize resource allocation, potentially leading to less effective strategies and missed opportunities in the market.
To enhance our analysis, I’ve processed the data to categorize it by different “Type of Sale.” This categorization allows us to delve into the specific characteristics and trends of new sales, resales, and sub-sales separately.
library(plotly)
P3_interactive <- plot_ly(data = data_cleaned, x = ~`Area (SQFT)`, y = ~`Transacted Price ($)`, type = 'scatter', mode = 'markers',
hoverinfo = 'text',
transforms = list(
list(
type = 'filter',
target = ~Quarter,
operation = '=',
value = unique(data_cleaned$Quarter)[1]
),
list(
type = 'filter',
target = ~`Type of Sale`,
operation = '=',
value = unique(data_cleaned$`Type of Sale`)[1]
)
),
text = ~paste("Price: ", `Transacted Price ($)`, "<br>Area: ", `Area (SQFT)`, "<br>Quarter: ", Quarter, "<br>Type of Sale: ", `Type of Sale`)) %>%
layout(title = 'Scatter Plot of Transacted Price vs. Area',
xaxis = list(title = 'Area (SQFT)'),
yaxis = list(title = 'Transacted Price ($)'),
updatemenus = list(
list(
type = "dropdown",
direction = "down",
showactive = TRUE,
buttons = lapply(unique(data_cleaned$`Type of Sale`), function(type) {
list(
method = "restyle",
args = list("transforms[1].value", type),
label = type
)
})
)
),
sliders = list(list(
active = 0,
currentvalue = list(prefix = "Quarter: "),
steps = lapply(unique(data_cleaned$Quarter), function(q) {
list(
label = q,
method = "restyle",
args = list("transforms[0].value", q)
)
})
)))P3_interactiveThe updated interactive scatter plot enhances data analysis capabilities significantly by incorporating dropdown menus for category-based filtering, such as transaction types (New Sale, Resale, Sub Sale). This feature allows users to segment the dataset on the fly, enabling a focused examination of trends and patterns specific to each category. Such granularity is invaluable for users needing to make nuanced assessments of market dynamics. Furthermore, the plot’s clear presentation even in high-density areas ensures that all data points are easily visible and distinguishable, avoiding any visual confusion and facilitating precise analysis. Enhanced interactivity, provided by the dropdown filters, improves the user experience by allowing dynamic and complex data segmentation within the visualization interface itself. This makes the plot a highly effective tool for stakeholders requiring a comprehensive and detailed view of various market conditions without the need for additional tools or programming efforts.